Code to clean the data file-by-file

Importing the necessary libraries

In [1]:
import pandas as pd
import csv
import string
import re
import nltk

nltk.download('stopwords')
nltk.download('names')
from nltk.corpus import stopwords
from nltk.corpus import names
from nltk import word_tokenize
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aruna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package names to
[nltk_data]     C:\Users\Aruna\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

%matplotlib inline
pd.set_option('display.max_colwidth', 150)

(A) Read the CSV File

In [3]:
df = pd.read_csv("C:\\Users\\Aruna\\Documents\\input\\Amazon Lambda.csv")

df['description'] = df['description'].apply(lambda x: " ".join(x for x in str(x).split())) # converting to string
 
df.head(10)
Out[3]:
id label description
0 3160 Amazon Lambda Question about waitForTaskToken With the recent release of the feature where a Step Function can invoke a Lambda with the waitForTaskToken I have ...
1 3159 Amazon Lambda encountering Lambda PolicyLengthExceededException. I'm encountering following error while deploying lambda functions with policies to be invoked f...
2 3159 Amazon Lambda I have encountered this problem too.
3 3159 Amazon Lambda Unfortunately this is a hard limit and cannot be increased. You can remove permissions using the following CLI command and specifying the statemen...
4 3159 Amazon Lambda I posted solution elsewhere, copy-pasting here: I am using gulp-awslambda instead. The error looks to be of AWS lambda instead. ISSUE: Re-deployin...
5 3159 Amazon Lambda Not a good solution and it doesn't work.
6 3158 Amazon Lambda Java 11 support on AWS Lamba? Does anyone have information when we will see AWS Lambda with Java 11? Corretto with Java 11 https://aws.amazon.com/...
7 3157 Amazon Lambda Lambda - python Hi everyone, I'm new to AWS Lambda so I'm writing a simple code using python: def lambda_handler(event, context): data = json.load...
8 3156 Amazon Lambda AWS lambda node.js code to connect to RDS SQLServer Express Problem I'm using the code here: https://stackoverflow.com/questions/33165045/aws-lamb...
9 3156 Amazon Lambda Hello I think you will have to download the library and then send it as part of your lambda function zip file. Here is a tutorial on how to do wha...
In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8946 entries, 0 to 8945
Data columns (total 3 columns):
id             8946 non-null int64
label          8946 non-null object
description    8946 non-null object
dtypes: int64(1), object(2)
memory usage: 209.8+ KB

Check out one sample post:

In [5]:
p = 5

df['description'][p]
Out[5]:
"Not a good solution and it doesn't work."

Top 30 words + frequency of each:

In [6]:
pd.Series(' '.join(df['description']).split()).value_counts()[:30]
Out[6]:
the         30335
to          22653
I           13841
a           13319
and         10099
is           9741
in           9068
Lambda       7350
of           6996
for          6780
that         6676
it           6105
this         5621
on           5615
function     5580
you          4837
lambda       4748
with         4716
have         4559
be           4348
{            4089
=            3839
not          3746
my           3518
from         3426
an           3277
as           3249
but          2952
can          2950
at           2942
dtype: int64
In [7]:
print("There are totally", df['description'].apply(lambda x: len(x.split(' '))).sum(), "words before cleaning.")
There are totally 741019 words before cleaning.

(B) Text Pre-processing

In [8]:
STOPWORDS = stopwords.words('english')
my_stop_words = ["hi", "hello", "regards", "thank", "thanks", "regard", "best", "wishes", "hey", "amazon", "aws", "s3",
"elastic", "beanstalk", "rds", "ec2", "lambda", "cloudfront", "cloud", "front", "vpc", "sns", "me",
"january", "february", "march", "april", "may", "june", "july", "august", "september", "october", 
"november", "december", "jan", "feb", "mar", "apr", "jun", "jul", "aug", "sep", "sept", "oct", "nov",
"dec", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "mon", "tue",
"wed", "thu", "fri", "sat", "sun", "ain't", "aren't", "can't", "can't've", "'cause", "could've", "couldn't",
"couldn't've", "didn't", "doesn't", "don't", "hadn't", "hadn't've", "hasn't", "haven't", "he'd", "he'd've",
"he'll", "he'll've", "he's", "how'd", "how'd'y", "how'll", "how's", "i'd", "i'd've", "i'll", "i'll've", "i'm",
"i've", "isn't", "it'd", "it'd've", "it'll", "it'll've", "it's", "let's", "mayn't", "might've", "mightn't",
"mightn't've", "must've", "mustn't", "mustn't've", "needn't", "needn't've", "oughtn't", "oughtn't've", "shan't",
"sha'n't", "shan't've", "she'd", "she'd've", "she'll", "she'll've", "she's", "should've", "shouldn't", "shouldn't've",
"so've", "so's", "that'd", "that'd've", "that's", "there'd", "there'd've", "there's", "they'd", "they'd've", "they'll",
"they'll've", "they're", "they've", "to've", "wasn't", "we'd", "we'd've", "we'll", "we'll've", "we're", "we've",
"weren't", "what'll", "what'll've", "what're", "what's", "what've", "when's", "when've", "where'd", "where's",
"where've", "who'll", "who'll've", "who's", "who've", "why's", "why've", "will've", "won't", "won't've", "would've",
"wouldn't", "wouldn't've", "yall", "yalld", "yalldve", "yallre", "yallve", "youd", "youdve", "youll",
"youllve", "youre", "youve", "do", "did", "does", "had", "have", "has", "could", "can", "as", "is",
"shall", "should", "would", "will", "you", "me", "please", "know", "who", "we", "was", "were", "edited", "by", "pm"]

name = names.words()
STOPWORDS.extend(my_stop_words)
STOPWORDS.extend(name)

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,:;#+?]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z - _.]+')
REMOVE_HTML_RE = re.compile(r'<.*?>')
REMOVE_HTTP_RE = re.compile(r'http\S+')

STOPWORDS = [BAD_SYMBOLS_RE.sub('', x) for x in STOPWORDS]

Convert to lowercase

In [9]:
df['description'] = df['description'].apply(lambda x: " ".join(x.lower() for x in str(x).split(" ")))

df['description'][p]
Out[9]:
"not a good solution and it doesn't work."

Remove all HTML tags

In [10]:
df['description'] = df['description'].apply(lambda x: " ".join(REMOVE_HTML_RE.sub(' ', x) for x in str(x).split()))

df['description'][p]
Out[10]:
"not a good solution and it doesn't work."
In [11]:
df['description'] = df['description'].apply(lambda x: " ".join(REMOVE_HTTP_RE.sub(' ', x) for x in str(x).split()))

df['description'][p]
Out[11]:
"not a good solution and it doesn't work."

Replace certain characters by space (quotation marks, parantheses etc)

In [12]:
df['description'] = df['description'].apply(lambda x: " ".join(REPLACE_BY_SPACE_RE.sub(' ', x) for x in str(x).split()))

df['description'][p]
Out[12]:
"not a good solution and it doesn't work."

Remove any unwanted symbols (like $, @ etc)

In [13]:
df['description'] = df['description'].apply(lambda x: " ".join(BAD_SYMBOLS_RE.sub('', x) for x in str(x).split()))

df['description'][p]
Out[13]:
'not a good solution and it doesnt work.'

Remove trailing punctuation marks and any symbol patterns

In [14]:
df['description'] = df['description'].apply(lambda x: " ".join(x.strip('.') for x in x.split()))
df['description'] = df['description'].apply(lambda x: " ".join(x.strip('-') for x in x.split()))
df['description'] = df['description'].apply(lambda x: " ".join(x.strip('_') for x in x.split()))
df['description'][p]
Out[14]:
'not a good solution and it doesnt work'

Remove any numbers

In [15]:
df['description'] = df['description'].apply(lambda x: " ".join(x for x in x.split() if not x.isdigit()))

df['description'][p]
Out[15]:
'not a good solution and it doesnt work'

Remove the stop words

In [16]:
df['description'] = df['description'].apply(lambda x: " ".join(x for x in x.split() if x not in STOPWORDS
                                                               and len(x) > 1))

df['description'][p]
Out[16]:
'good solution work'

Results after cleaning data:

In [17]:
df.head()
Out[17]:
id label description
0 3160 Amazon Lambda question waitfortasktoken recent release feature step function invoke waitfortasktoken couple questions waiting lambdas count accounts total concu...
1 3159 Amazon Lambda encountering policylengthexceededexception encountering following error deploying functions policies invoked apig error occurred policylengthexcee...
2 3159 Amazon Lambda encountered problem
3 3159 Amazon Lambda unfortunately hard limit cannot increased remove permissions using following cli command specifying statement suggest use case better permission w...
4 3159 Amazon Lambda posted solution elsewhere copypasting using gulpawslambda instead error looks instead issue redeploying existing failing solution delete function ...

Top 30 words + frequency of each:

In [18]:
pd.Series(' '.join(df['description']).split()).value_counts()[:30]
Out[18]:
function       8161
error          3058
using          2624
code           2604
var            2147
event          2070
use            2056
file           1980
get            1913
api            1794
request        1789
new            1772
like           1724
functions      1648
time           1628
data           1521
issue          1464
one            1272
see            1257
need           1214
run            1153
console.log    1132
response       1125
message        1120
also           1115
version        1087
problem        1086
gateway        1072
access         1063
way            1061
dtype: int64
In [19]:
print("There are totally", df['description'].apply(lambda x: len(x.split(' '))).sum(), "words after cleaning.")
There are totally 372181 words after cleaning.

(C) Write to CleanText.csv

In [20]:
with open('C:\\Users\\Aruna\\Documents\\ACMS-IID\\input\\CleanText.csv', 'a', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # writer.writerow(['id', 'label', 'description'])
    for i in range(0, len(df['description'])):
        if len(df['description'][i]) > 1:
            writer.writerow([df['id'][i], df['label'][i], df['description'][i]])

(D) Generate the word cloud

In [21]:
msgs = " ".join(str(msg) for msg in df['description'])
fig, ax = plt.subplots(1, 1, figsize  = (100,100))
wordcloud = WordCloud(max_font_size = 20, max_words = 20, background_color = "white").generate(msgs)
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
Out[21]:
(-0.5, 399.5, 199.5, -0.5)
In [22]:
msgs = " ".join(str(msg) for msg in df['description'])
fig, ax = plt.subplots(1, 1, figsize  = (100,100))
wordcloud = WordCloud(max_font_size = 20, max_words = 50, background_color = "white").generate(msgs)
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
Out[22]:
(-0.5, 399.5, 199.5, -0.5)
In [23]:
msgs = " ".join(str(msg) for msg in df['description'])
fig, ax = plt.subplots(1, 1, figsize  = (100,100))
wordcloud = WordCloud(max_font_size = 20, max_words = 100, background_color = "white").generate(msgs)
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
Out[23]:
(-0.5, 399.5, 199.5, -0.5)
In [24]:
msgs = " ".join(str(msg) for msg in df['description'])
fig, ax = plt.subplots(1, 1, figsize  = (100,100))
wordcloud = WordCloud(max_font_size = 20, max_words = 500, background_color = "white").generate(msgs)
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
Out[24]:
(-0.5, 399.5, 199.5, -0.5)
In [25]:
msgs = " ".join(str(msg) for msg in df['description'])
fig, ax = plt.subplots(1, 1, figsize  = (100,100))
wordcloud = WordCloud(max_font_size = 20, max_words = 1000, background_color = "white").generate(msgs)
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
Out[25]:
(-0.5, 399.5, 199.5, -0.5)
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: